* Reset Stata before the pipeline runs
clear all

* Root folder of the replication package (edit this line to relocate the package)
global path "~/Dropbox/IT_Revolution/Replication_package/JPE submission"

* Sub-folders used below: raw inputs, intermediate/clean datasets, and output
global path_rawdata "${path}/Raw_data"
global path_cleandata "${path}/Clean_data"
global path_output "${path}/Output"

********************************************************************************
*** Manufacturing (1900-1940)
********************************************************************************

*** Step 1: Data cleaning
********************************************************************************
use "$path_rawdata/raw_data_impus_Manuf.dta", clear

* Sample selection: keep ages 16-64 and remove records with labforce == 1
* (presumably the IPUMS "not in labor force" code -- verify against the codebook).
* No sex restriction is applied here; sex == 1 is imposed in Steps 2 and 3.
keep if inrange(age, 16, 64)
keep if labforce != 1

* Keep occupations with occ1950 below 979 (presumably the classified
* occupation codes -- verify), then make occ1950 the working occupation
* variable "occ"; the original occ is preserved as occIPUMS.
keep if occ1950 < 979
rename occ occIPUMS
rename occ1950 occ

* Indicator variables (0/1)
gen manuf = (ind1950 > 300) & (ind1950 < 500)
gen white = (race == 1)
gen USborn = (bpl < 150)

* One unit of employment per record, summed when collapsing to occupation cells
gen emp = 1

save "$path_cleandata/data_clean_1900_1940.dta", replace

*** Step 2: Create occupation exposure 
********************************************************************************
* Builds one record per occupation (for 1900) holding the perwt-weighted mean
* of the industry-level variable elect_share_noutil among that occupation's
* workers.
use "$path_cleandata/data_clean_1900_1940.dta", clear

* Exposure is measured in the 1900 cross-section, restricted to sex == 1
keep if year == 1900
keep if sex == 1

* Attach the industry-level electricity shares. Only records whose ind1950 is
* found in the shares file are kept; keep(match) nogenerate replaces the former
* "drop if _m < 3", which depended on _m being accepted as an abbreviation
* of _merge (fails under "set varabbrev off").
merge m:1 ind1950 using "$path_cleandata/electricity_shares", keep(match) nogenerate

collapse (mean) elect_share_noutil [aw = perwt], by(occ year) fast

save "$path_cleandata/exposureManuf.dta", replace

*** Step 3: Generate sample by occupation and group
********************************************************************************
* For each sample specification, writes one occupation-level file per age
* cutoff: data_occ_1900_1940_s<spec>_a<age>.dta. Each file is wide in year
* (emp<year>, manuf<year>) with one row per occupation and group, where
*   group 0 = all workers, group 1 = age <= cutoff, group 2 = age > cutoff,
* merged with the occupation exposure from Step 2.
*
* Specifications (all restricted to sex == 1):
*   1: no further restriction, age cutoffs 25 through 35
*   2: USborn == 1,            age cutoff 29
*   3: white == 1,             age cutoff 29
*
* NOTE(review): emp is collapsed with (sum) under aweights. Stata rescales
* aweights to sum to the number of observations in each by-group, so emp
* should equal the unweighted cell count rather than a perwt-weighted
* population total -- confirm this is the intended employment measure.

forvalues spec = 1/3 {
	use "$path_cleandata/data_clean_1900_1940.dta", clear 

	* restriction common to every specification
	keep if sex == 1

	* specification-specific restriction and age cutoffs
	if `spec' == 1 {
		local age_range 25/35
	}
	else if `spec' == 2 {
		keep if USborn == 1
		local age_range 29
	}
	else {
		keep if white == 1
		local age_range 29
	}

	* occupation-by-year cells for all workers (group 0), saved to a scratch
	* file that is appended to every age-cutoff dataset below
	preserve
		gen group = 0
		collapse (sum) emp (mean) manuf [aw = perwt], by(occ year group) fast
		save "$path_cleandata/data_occ_1900_1940_s`spec'_all.dta", replace
	restore 

	* one output file per age cutoff
	foreach age of numlist `age_range' {
		preserve
			* group 1: at or below the cutoff; group 2: above it
			gen group = cond(age <= `age', 1, 2)

			collapse (sum) emp (mean) manuf [aw = perwt], by(occ year group) fast

			* stack the all-workers cells, then go wide in year
			append using "$path_cleandata/data_occ_1900_1940_s`spec'_all.dta"
			reshape wide emp manuf, i(occ group) j(year)

			* attach 1900 occupation exposure; _merge is left in the saved
			* file and unmatched rows from either side are retained
			merge m:1 occ using "$path_cleandata/exposureManuf.dta"

			save "$path_cleandata/data_occ_1900_1940_s`spec'_a`age'.dta", replace
		restore 
	}

	* the all-workers scratch file is no longer needed for this specification
	erase "$path_cleandata/data_occ_1900_1940_s`spec'_all.dta"
}

*** Step 4: Bilateral distance between occupations
********************************************************************************
* Produces temp_distance_all_early.dta: one row per ordered pair of distinct
* occ1990dd occupations, with
*   d_ent = sum over k of s_k * log(s_k / s_k_2),
* where s_k and s_k_2 are the task shares (abstract, routine, manual) of the
* first and second occupation of the pair.
*
* Variable names are written out in full throughout this step (exposure_manuf,
* occ1990dd, _merge) so it does not depend on variable-name abbreviation.

* --- task shares by occ1990dd --------------------------------------------------
use "$path_rawdata/occ1990dd_task_alm.dta", clear

* rescale the three task measures so that they sum to one within occupation
gen temp = task_abstract + task_routine + task_manual
foreach var of varlist task_abstract task_routine task_manual {
	replace `var' = `var'/temp
}
drop temp

save "$path_cleandata/temp_pre", replace

* --- occupation sample (specification 1, age cutoff 29) -------------------------
use "$path_cleandata/data_occ_1900_1940_s1_a29.dta", clear

* keep occupations for which log(emp1940/emp1900) is defined in all three
* groups (0, 1 and 2), i.e. employment is positive in both 1900 and 1940
gen change_emp4 = log(emp1940/emp1900)
gen aind = change_emp4 != .
egen ind = total(aind), by(occ)
keep if ind == 3
keep if aind == 1

* standardized exposure measures
* NOTE(review): the standardization is computed over the rows of all three
* groups, before restricting to group 0 -- confirm this is intended.
egen exposure_manuf = std(manuf1900)
egen exp_elec = std(elect_share_noutil)

* continue with the all-workers rows only
keep if group == 0
drop group _merge

* map occ (occ1950 coding) to occ1990dd; unmatched rows on either side are dropped
merge m:1 occ using "$path_rawdata/occ1950_occ1990dd.dta", keep(match) nogenerate

* aggregate to occ1990dd: employment is summed, exposures are simple averages
collapse (sum) emp* (mean) exposure_manuf exp_elec, by(occ1990dd)
rename exposure_manuf exposure_m
sum exposure_m, det

* attach task shares; only occupations present in both files are kept
merge 1:1 occ1990dd using "$path_cleandata/temp_pre", keep(match) nogenerate

* --- all ordered pairs of distinct occupations ----------------------------------
* A constant key lets joinby form the full cross product of the occupation
* list with a copy of itself whose variables carry the suffix _2.
gen merge_var = 1

preserve

	foreach var of varlist emp* exp* occ1990dd task* {
		rename `var' `var'_2
	}

	save "$path_cleandata/temp", replace
	
restore

joinby merge_var using "$path_cleandata/temp"

* remove pairs of an occupation with itself
drop if occ1990dd == occ1990dd_2
drop merge_var

* --- distance in task space -----------------------------------------------------
* NOTE(review): a task share of exactly zero in either occupation makes the
* log term missing, so d_ent is missing for that pair -- confirm that the
* task data contain no zero shares.
gen d_ent = 0
foreach var of varlist task_abstract task_manual task_routine {
	replace d_ent = d_ent + `var'*log(`var'/`var'_2)
}
drop task*

save "$path_cleandata/temp_distance_all_early.dta", replace

*** Step 5: Compute average distance metric
********************************************************************************
* For every occupation (the second member of each pair, occ1990dd_2), ad_ent is
* the mean of d_ent over all other occupations, weighted by those occupations'
* 1900 employment (emp1900). The result is merged onto the age-29 occupation
* file of each specification and saved with the suffix "distance".
*
* _merge is written out in full (or suppressed with merge options) so this
* step does not depend on _m being accepted as an abbreviation.
use "$path_cleandata/temp_distance_all_early.dta", clear

collapse (mean) ad_ent = d_ent [aw = emp1900], by(occ1990dd_2)

rename occ1990dd_2 occ1990dd

save "$path_cleandata/distance_temp.dta", replace

foreach spec of numlist 1/3 {
	use "$path_cleandata/data_occ_1900_1940_s`spec'_a29.dta", clear 
	
	* discard the merge indicator left in the file by Step 3
	drop _merge

	* map occ to occ1990dd, keeping matched rows only
	merge m:1 occ using "$path_rawdata/occ1950_occ1990dd.dta", keep(match) nogenerate
	
	* attach the average distance; unmatched rows from either side are retained
	merge m:1 occ1990dd using "$path_cleandata/distance_temp.dta", nogenerate
		
	save "$path_cleandata/data_occ_1900_1940_s`spec'_a29distance.dta", replace
}

* remove the intermediate files created in Steps 4 and 5
erase "$path_cleandata/distance_temp.dta"
erase "$path_cleandata/temp.dta"
erase "$path_cleandata/temp_pre.dta"
